Exemplo n.º 1
0
def extract_text(output_dir):
    """Load the bundled ``cnp_739.tsv`` question table and save it as a corpus.

    The table is read as tab-separated text from the directory that contains
    this module.  Every row contributes one corpus entry keyed by its
    ``question_label``; the entry stores the row's ``question_text`` under
    ``"text"`` and a one-element ``"labels"`` list holding that same label.
    Rows that share a label overwrite one another, so the last one wins.

    Args:
        output_dir: Forwarded unchanged to ``save_sentences``.
    """
    module_dir = os.path.dirname(__file__)
    plugin_directory = os.path.abspath(module_dir)
    table_path = "%s/cnp_739.tsv" % (plugin_directory)
    questions = pandas.read_csv(table_path, sep="\t")

    # iterrows() yields (index, row) pairs; only the row is needed here.
    corpus_input = {
        row.question_label: {
            "text": row.question_text,
            "labels": [row.question_label],
        }
        for _, row in questions.iterrows()
    }

    # Hand the corpus to save_sentences (per the original note, this saves
    # the entries as text files in the output folder).
    save_sentences(corpus_input, output_dir=output_dir)
Exemplo n.º 2
0
def extract_text(uids, output_dir):
    """Template extraction function showing the inputs ``save_sentences`` takes.

    Per the original notes, this function is called by a job and must call
    ``save_sentences``; every user function must accept ``output_dir``.

    Three example corpus shapes are built below for illustration.  Only the
    last one (a plain list of raw texts) is actually passed on.  ``uids`` is
    accepted but not used by this template.

    Args:
        uids: Unused in this template.
        output_dir: Forwarded unchanged to ``save_sentences``.
    """
    # Shape 1 (illustration only): a dictionary, for items that have ids.
    corpus_with_ids = {
        "uid1": {"text": "One fish, two fish."},
        "uid2": {"text": "Nerd fish, wordfish."},
    }

    # Shape 2 (illustration only): the same, plus labels (to be used for
    # classification).
    corpus_with_labels = {
        "uid1": {"text": "One fish, two fish.", "labels": ["counting", "poem"]},
        "uid2": {"text": "Nerd fish, wordfish.", "labels": ["poem"]},
    }

    # Shape 3 (the one actually saved): raw text in a list, with no ids.
    corpus_input = [
        "This is text from article 1 with no id.",
        "This is text from article 2 with no id.",
    ]

    # Per the original note: saves articles to individual text files in the
    # output folder.
    save_sentences(corpus_input, output_dir=output_dir)
Exemplo n.º 3
0
    features = pandas.read_csv(f,sep="\t")  

    # Prepare dictionary with key [pmid] and value [text]
    features.index = features.pmid
    features = features.drop("pmid",axis=1)
    corpus_input = dict()
    for pmid,article in articles.iteritems():
        # Label the article with nonzero values
        try:
            labels = features.columns[features.loc[int(pmid)]!=0].tolist()     
            corpus_input[pmid] = {"text":article.getAbstract(),"labels":labels}
        except:
            pass

    # Save articles to text files in output folder     
    save_sentences(corpus_input,output_dir=output_dir)

def extract_terms(output_dir):
    """Save the feature table's column names, minus the first, as terms.

    The first path returned by ``download_data`` is read as a tab-separated
    table.  Its column names become the term list after the leading column
    is dropped (the original code marks that column as ``pmid``).

    Args:
        output_dir: Passed positionally to ``save_terms`` with the term list.
    """
    # download_data returns two values; only the first is used here.
    features_path, _ = download_data()
    column_index = pandas.read_csv(features_path, sep="\t").columns

    terms = column_index.tolist()
    del terms[0]  # leading column is the pmid identifier, not a term

    save_terms(terms, output_dir)
    

def generate_maps(terms,output_dir):

    f,d = download_data()
    features = pandas.read_csv(f,sep="\t")  
    database = pandas.read_csv(d,sep="\t")