def original_generate_token_graph():
    corp = []
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            else:
                continue
            if doc != "":
                doc = doc.decode("utf8")
                #doc = words_to_phrases(doc)
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp = it.chain(corp, doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    corp = list(corp)
    graph = nx.Graph()
    weights = Counter()
    edges = set()

    # Fully connect the first five-token window.  permutations() already
    # yields both orderings of each pair, so one increment per tuple is
    # enough.  Bug fix: these edges were counted but never added to the
    # edge set, so they were silently dropped from the graph.
    window = corp[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        edges.add(tup)

    # Slide the window: the token entering at position i+2 is linked to the
    # four tokens before it.  Starting at i = 3 avoids re-counting corp[4],
    # which the initial window already covered.
    for i in range(3, len(corp) - 2):
        for j in range(i - 2, i + 2):
            weights[(corp[j], corp[i + 2])] += 1
            weights[(corp[i + 2], corp[j])] += 1
            edges.add((corp[i + 2], corp[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    print nx.info(graph)
    nx.write_weighted_edgelist(graph, "graph.g")
    print nx.to_numpy_matrix(graph)
    np.savetxt("graph.adj", nx.to_numpy_matrix(graph))
    print "finished"
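# A minimal usage sketch (not part of the original script): it assumes
# `dataset_folder` points at a directory of per-author subfolders holding
# .pdf/.docx files, and that the conversion helpers above are available.
# The weighted edge list written to "graph.g" can be reloaded with the
# standard networkx reader:
#
#   original_generate_token_graph()
#   G = nx.read_weighted_edgelist("graph.g")
#   print len(G.nodes()), len(G.edges())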
doc = "" user = "" if file_extension == ".pdf": user = file.split("_")[0] doc = convert_pdf_to_txt(file_path) elif file_extension == ".docx": user = file.split("_")[0] doc = convert_docx_to_txt(file_path) else: continue if doc != "": doc = doc.decode("utf8") doc = doc_to_wordlist(doc) if users.has_key(user): users[user] += doc else: users[user] = doc print users.keys() print "Number of users = " + str(len(users)) # Initial a vector of syn0 and syn1 for a vector of a label new_syn0 = empty((1, model.layer1_size), dtype=REAL) new_syn1 = empty((1, model.layer1_size), dtype=REAL)
        files = [f for f in listdir(subsub_dir_path)
                 if isfile(join(subsub_dir_path, f)) and ".pdf" in f]
        for file_name in files:
            file_paths.append(subsub_dir_path + file_name)

    #print file_paths
    #print len(file_paths)

    label_i = 1
    for file_path in file_paths:
        doc = convert_pdf_to_txt(file_path)
        if doc != "":
            doc = doc.decode("utf8")
            doc = doc_to_wordlist(doc, remove_stopwords=True)
            label = str(folder) + "_" + str(label_i)
            label_i = label_i + 1
            doc_labels[label] = doc

    # Initialize a vector of syn0 and syn1 for a vector of a label
    new_syn0 = empty((1, model.layer1_size), dtype=REAL)
    new_syn1 = empty((1, model.layer1_size), dtype=REAL)
    is_first = True

    # Initialize and add a vector of syn0 and syn1 for a vector of a label
    for doc_label in doc_labels:
def generate_token_group_graph(group="author", output="graph"):
    corp = defaultdict(list)
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                # Derive the grouping key for this file.  Bug fixes: compare
                # strings with == (not "is"), keep the `group` mode parameter
                # intact instead of overwriting it on the first file, and
                # split the bare filename rather than the full path.
                base_name = splitext(file)[0]
                if group == "author":
                    key = base_name.split("_")[0]
                elif group == "document":
                    key = base_name
                else:
                    key = ""
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp[key] = it.chain(corp[key], doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    if group == "":
        coll = list(corp[""])
        # permutations() already yields both orderings, so the second
        # increment double-counted every pair and is removed.
        window = coll[0:5]
        for tup in it.permutations(window, 2):
            weights[tup] += 1
            edges.add(tup)
        # Start at i = 3 so coll[4], already covered by the initial window,
        # is not counted twice.
        for i in range(3, len(coll) - 2):
            for j in range(i - 2, i + 2):
                weights[(coll[j], coll[i + 2])] += 1
                weights[(coll[i + 2], coll[j])] += 1
                edges.add((coll[i + 2], coll[j]))
    else:
        # Bug fix: iterating a dict yields keys only; iteritems() gives the
        # (group, tokens) pairs this loop expects.
        for (g, coll) in corp.iteritems():
            coll = list(coll)
            window = coll[0:5]
            for tup in it.permutations(window, 2):
                weights[tup] += 1
                edges.add(tup)
            for t in window:
                if t != g:
                    weights[(g, t)] += 1
                    weights[(t, g)] += 1
                    edges.add((g, t))
            for i in range(3, len(coll) - 2):
                # Hoisted out of the j loop: the group edge does not depend
                # on j and was being incremented four times per token.
                if coll[i + 2] != g:
                    weights[(g, coll[i + 2])] += 1
                    weights[(coll[i + 2], g)] += 1
                    edges.add((g, coll[i + 2]))
                for j in range(i - 2, i + 2):
                    weights[(coll[j], coll[i + 2])] += 1
                    weights[(coll[i + 2], coll[j])] += 1
                    edges.add((coll[i + 2], coll[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    nx.write_weighted_edgelist(graph, output + ".g")
    print nx.to_numpy_matrix(graph)
    np.savetxt(output + ".adj", nx.to_numpy_matrix(graph))
    generate_graph_statistics(graph, output)
    print "finished"
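# Hedged usage sketch (not in the original): build one graph in which each
# author identifier becomes a node linked to the tokens of that author's
# documents, then reload the weighted edge list.  The "agraph" prefix is
# purely illustrative.
#
#   generate_token_group_graph(group="author", output="agraph")
#   G = nx.read_weighted_edgelist("agraph.g")
#   print nx.info(G)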
def generate_token_group_bigraph(group="author", output="graph", threshold=5):
    corp = defaultdict(list)
    sentences = []  # Initialize an empty list of sentences

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".pdf":
                doc = convert_pdf_to_txt(file_path)
            elif file_extension == ".docx":
                doc = convert_docx_to_txt(file_path)
            elif file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                # Derive the grouping key from the filename.  Before, the
                # author was just another token; the "a_" prefix now
                # distinguishes author identifiers from author tokens.
                # Same bug fixes as generate_token_group_graph: compare with
                # ==, do not overwrite the `group` parameter, and split the
                # bare filename rather than the full path.
                base_name = splitext(file)[0]
                if group == "author":
                    key = "a_" + base_name.split("_")[0]
                elif group == "document":
                    key = base_name
                else:
                    key = ""
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp[key] = it.chain(corp[key], doc)
                #sentences += doc_to_sentences(doc, tokenizer, remove_stopwords=False)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    if group == "":
        coll = list(corp[""])
        window = coll[0:5]
        for tup in it.permutations(window, 2):
            weights[tup] += 1
            edges.add(tup)
        for i in range(3, len(coll) - 2):
            for j in range(i - 2, i + 2):
                weights[(coll[j], coll[i + 2])] += 1
                weights[(coll[i + 2], coll[j])] += 1
                edges.add((coll[i + 2], coll[j]))
    else:
        for (g, coll) in corp.iteritems():
            coll = list(coll)
            window = coll[0:5]
            """ This code adds edges from one token to every other token.
            Need to remove.
            for tup in it.permutations(window, 2):
                weights[tup] += 1
                edges.add(tup)
            """
            for t in window:
                if t != g:
                    weights[(g, t)] += 1
                    weights[(t, g)] += 1
                    edges.add((g, t))
            # The inner j loop only served the (removed) token-to-token
            # edges; keeping it would have counted each group edge four
            # times, so it is dropped here.
            for i in range(3, len(coll) - 2):
                if coll[i + 2] != g:
                    weights[(g, coll[i + 2])] += 1
                    weights[(coll[i + 2], g)] += 1
                    edges.add((g, coll[i + 2]))
                """ This code also adds token to token edges
                for j in range(i - 2, i + 2):
                    weights[(coll[j], coll[i + 2])] += 1
                    weights[(coll[i + 2], coll[j])] += 1
                    edges.add((coll[i + 2], coll[j]))
                """

    for e in edges:
        # Bimax and biclique detection techniques do not use weights, so
        # just create thresholded edges.
        if weights[e] > threshold:
            graph.add_edge(e[0], e[1])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    generate_graph_statistics(graph, output + "_b")
    print "finished"
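# Hedged usage sketch (not in the original): build the thresholded
# author-to-token graph and emit its statistics; the "bigraph" output
# prefix is illustrative.
#
#   generate_token_group_bigraph(group="author", output="bigraph", threshold=5)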
def generate_token_group_bigraph_from_amazon_imdb_dataset(output="graph", threshold=5):
    # This function creates a graph from the imdb/amazon dataset and stores
    # it in a file called edgelist.graph in the folder in which the code is
    # being run.
    corp = []

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            doc = ""
            if file_extension == ".txt":
                with open(file_path) as f:
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                corp = it.chain(corp, doc)

    graph = nx.Graph()
    weights = Counter()
    edges = set()

    coll = list(corp)
    # permutations() yields both orderings, so one increment per tuple is
    # enough; the second increment double-counted every pair.
    window = coll[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        edges.add(tup)
    # Start at i = 3 so coll[4], already covered above, is not re-counted.
    for i in range(3, len(coll) - 2):
        for j in range(i - 2, i + 2):
            weights[(coll[j], coll[i + 2])] += 1
            weights[(coll[i + 2], coll[j])] += 1
            edges.add((coll[i + 2], coll[j]))

    for e in edges:
        graph.add_edge(e[0], e[1], weight=weights[e])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    nx.write_edgelist(graph, "edgelist.graph", data=['weight'])
    print "finished"

#generate_token_group_bigraph_from_amazon_imdb_dataset(output="agraph")
#generate_graph_statistics_from_file("edgelist.graph")
#parsereviewfile()
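# Hedged example (not in the original): the weighted edge list written by
# the function above can be read back with networkx, parsing the weight
# column as an int:
#
#   G = nx.read_edgelist("edgelist.graph", data=(("weight", int),))
#   print nx.info(G)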
def generate_token_group_bigraph_from_amazon_imdb_dataset(output="graph", threshold=5):
    # Note: this redefinition shadows the version above.  It additionally
    # anonymises tokens as "textN" ids, pickles the two index mappings, and
    # stores the graph in corpusEdgelist.graph in the folder in which the
    # code is being run.
    corp = []

    input_folders = [sub_dir for sub_dir in listdir(dataset_folder)
                     if isdir(join(dataset_folder, sub_dir))]

    for folder in input_folders:
        dir_path = dataset_folder + os.sep + folder + os.sep
        files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
        print "Looking inside: " + dir_path
        for file in files:
            file_path = dir_path + file
            file_name, file_extension = splitext(file_path)
            print 'Scanning file: ', file_path
            doc = ""
            if file_extension == ".txt":
                with open(file_path) as f:
                    print 'Copying data from: ', file_path
                    for line in f:
                        doc = doc + " " + line
            else:
                continue
            if doc != "":
                #doc = doc.decode("utf8")
                doc = doc.lower()
                doc = doc_to_wordlist(doc, True)
                print 'Words retrieved: ', doc
                corp = it.chain(corp, doc)

    graph = nx.Graph()
    weights = Counter()  # collected but unused: the written graph is unweighted
    edges = list()

    coll = list(corp)
    # Map every distinct token to an anonymous "textN" id, keeping both
    # directions of the mapping.
    wordList = list(set(coll))
    wordDict = {}
    revWordDict = {}
    for i in range(len(wordList)):
        wordDict['text' + str(i)] = wordList[i]
        revWordDict[wordList[i]] = 'text' + str(i)

    window = coll[0:5]
    for tup in it.permutations(window, 2):
        weights[tup] += 1
        print 'Trying to add edge: ', [revWordDict[tup[0]], revWordDict[tup[1]]]
        edges.append([revWordDict[tup[0]], revWordDict[tup[1]]])
    # Start at i = 3 so coll[4], already covered above, is not re-counted.
    for i in range(3, len(coll) - 2):
        for j in range(i - 2, i + 2):
            edges.append([revWordDict[coll[i + 2]], revWordDict[coll[j]]])

    for e in edges:
        print 'Trying to add edge: ', e[0], e[1]
        graph.add_edge(e[0], e[1])

    #nx.write_weighted_edgelist(graph, output+"_b.g")
    #print nx.to_numpy_matrix(graph)
    #np.savetxt(output+"_b.adj", nx.to_numpy_matrix(graph))
    nx.write_edgelist(graph, "corpusEdgelist.graph")

    # Open in binary mode: cPickle protocol -1 writes a binary stream.
    g = open('index2word.pkl', 'wb')
    cPickle.dump(wordDict, g, -1)
    g.close()
    g = open('word2index.pkl', 'wb')
    cPickle.dump(revWordDict, g, -1)
    g.close()
    print "finished"
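# Hedged example (not in the original): recover tokens from the anonymised
# node ids written by the function above.
#
#   import cPickle
#   with open('index2word.pkl', 'rb') as f:
#       index2word = cPickle.load(f)
#   G = nx.read_edgelist("corpusEdgelist.graph")
#   some_node = list(G.nodes())[0]
#   print some_node, '->', index2word[some_node]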