Example #1
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-tr', '--train_image', help='image used for training the algorithm', required=True)
    required.add_argument('-te', '--test_image', help='image to evaluate', required=True)
    optional.add_argument('-l', '--log', dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                          help='Set the logging level')
    optional.add_argument('-knn', '--knn', help='flag to run knn', action='store_true')

    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('regular.time').info('starting running handwritten-notes script')

    digits, y_train = load_digits(args.train_image)

    x_train = pixels_to_hog_20(digits)

    num_pixels = x_train.shape[1]
    num_classes = len(np.unique(y_train))

    if args.knn:
        logging.getLogger('regular.time').info('training knn model')
        model = KNeighborsClassifier()
        model.fit(x_train, y_train)
    else:
        logging.getLogger('regular.time').info('training NN model')
        model = Sequential()
        model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
        model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))   
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    process_test_image(dataset=args.test_image, model=model, model_type=args.knn)
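All of these examples call a logger_initialization helper that is not shown. A minimal sketch of what it might do, assuming it simply maps the optional command-line level onto Python's standard logging module so that named loggers such as 'regular' and 'regular.time' inherit the configuration; this is an inference from the call sites, not the project's actual implementation:

import logging

def logger_initialization(log_level=None):
    # fall back to INFO when no -l/--log argument was supplied
    level = getattr(logging, log_level.upper()) if log_level else logging.INFO
    logging.basicConfig(
        level=level,
        format='%(asctime)s %(name)s %(levelname)s: %(message)s')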
Example #2
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='dataset file that has not been processed')
    parser.add_argument('-tr', '--train_file', help='processed training dataset file')
    parser.add_argument('-te', '--test_file', help='processed testing dataset file')
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    training_dir = args.train_file
    testing_dir = args.test_file

    # if those arguments were not passed, populate them with the default datasets
    if not training_dir:
        training_dir = 'datasets/train_data_processed.csv'
    else:
        training_dir = 'datasets/' + training_dir + '.csv'
    if not testing_dir:
        testing_dir = 'datasets/test_data_processed.csv'
    else:
        testing_dir = 'datasets/' + testing_dir + '.csv'

    load(train_directory=training_dir, test_directory=testing_dir)

    logging.getLogger('regular.time').info('starting running pre-processing script')

    # import data from file
    dataset = load_data(args.input_file)
    # calculate relevant variables' values
    process_dataset(dataset=dataset, train_dir=training_dir, test_dir=testing_dir)

    # save it
    store_dataset()

    logging.getLogger('regular.time').info('finished running pre-processing script')
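load_data, process_dataset and store_dataset are project helpers whose implementations are not shown. Judging only from the call site, load_data probably amounts to little more than a CSV read; a hypothetical sketch:

import pandas as pd

def load_data(input_file):
    # read the raw, unprocessed dataset from disk
    return pd.read_csv(input_file)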
Example #3
def main():
    """
    starts running the script
    :return: None.
    """

    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument('-r',
                        '--retrieve',
                        help='argument used to pull data from PubMed',
                        action='store_true')
    parser.add_argument(
        '-p',
        '--process',
        help='argument used to process the info into paper, author, medical and '
        'title_abstracts records',
        action='store_true')
    parser.add_argument('-a',
                        '--analyze',
                        help='run topic modeling on the file',
                        action='store_true')
    parser.add_argument(
        '-f',
        '--file',
        help='file to process. Depending on whether the retrieve, process or analyze '
        'option was selected, there is a different default file')
    parser.add_argument('-l',
                        '--log',
                        dest='logLevel',
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        type=str.upper,
                        help='Set the logging level')

    if sys.platform == "darwin" or sys.platform == "win32":
        if sys.platform == "win32":
            path = r'D:\dataset\scosy\dataset'
        else:
            path = '/Volumes/dataset/scosy/dataset'
    # Respublica
    else:
        path = 'dataset/'

    args = parser.parse_args()
    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running SCOSY')

    if args.retrieve:

        logging.getLogger('regular').info('retrieving data from PubMed')

        # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI’s
        # databases such as PubMed, GenBank, GEO, and many others
        # Use the mandatory email parameter so the NCBI can contact you if there is a problem
        Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
        logging.getLogger('regular').info(
            'searching PubMed for CHOP and UPENN authors')
        handle = Entrez.esearch(
            db="pubmed",
            retmax=100000000,
            idtype="esearch",
            mindate="2014/01/01",
            maxdate="2020/08/21",
            term=
            "Perelman School of Medicine[Affiliation] OR Children's Hospital of "
            "Philadelphia[Affiliation] OR University of Pennsylvania School of "
            "Medicine[Affiliation] OR School of Medicine University of "
            "Pennsylvania[Affiliation]",
            usehistory="y")
        search_results = Entrez.read(handle)
        handle.close()
        # obtaining the list of relevant PMIDs
        id_list = search_results["IdList"]

        # get all the record based on the PMIDs
        # logging.getLogger('regular.time').info('getting relevant authors\' records based on PMIDs')
        fetch_records_handle = Entrez.efetch(db="pubmed",
                                             id=id_list,
                                             retmode="text",
                                             rettype="medline")
        # need to read all the data from the handle and store in a file because if we just read line by line from the
        # generator and the internet connection is not strong, then we run into http errors:
        # http.client.IncompleteRead: IncompleteRead(0 bytes read)
        result_path = Path(path, 'results.txt')
        out_handle = result_path.open('w+')
        out_handle.write(fetch_records_handle.read())
        # the results are now in the local results file and the original handle has had all of its data extracted
        # (so we close it)
        out_handle.close()
        msg = 'saved authors\' records on local file = {0}'.format(result_path)
        logging.getLogger('regular.time').info(msg)

    elif args.process:

        # import data from file
        logging.getLogger('regular').info('reading data from result file')

        file_name = args.file
        if not file_name:
            file_name = 'results.txt'

        result_path = Path(path, file_name)
        records_handle = result_path.open()
        fetch_records = parse(handle=records_handle)

        # initializing variables
        mesh_description_dict = obtain_descriptions()

        # contains all the metadata elements on the author level: PubMed unique Identifier number(PMID), AuthorID (as a
        # combination of the author's last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
        # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
        author_record_df = pd.DataFrame(columns=[
            'PMID', 'Author', 'author_chop', 'author_penn', 'Role',
            'AffiliationInfo'
        ])
        # contains all the metadata elements on the paper level: PubMed unique Identifier number(PMID), Title, Abstract,
        # Year, Month, AuthorList, SubjectList, date
        paper_record_df = pd.DataFrame(columns=[
            'PMID', 'Title', 'Abstract', 'Year', 'Month', 'author_list',
            'subject_list', 'date'
        ])
        # contains all the metadata of the medical information: PubMed unique Identifier number(PMID), Primary Medical
        # Subject Header (MESH) and the description ID
        medical_record_df = pd.DataFrame(
            columns=['PMID', 'Desc', 'Primary_MeSH'])

        title_list = list()
        abstract_list = list()
        mesh_list = list()

        # get the relevant information for each record
        for record_index, record in enumerate(fetch_records):

            logging.getLogger('regular').debug(
                'record index = {0}'.format(record_index))

            try:
                pmid = record.get('PMID')
                title = record.get('TI')
                abstract = record.get('AB')
                authors = record.get('FAU')
                affiliations = record.get('AD')
                publication_type = record.get('PT')
                mesh_term = record.get('MH')
                date_created = record.get('EDAT')
                year, month = date_created.split('/')[:2]
                date = year + '/' + month

                logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
                logging.getLogger('regular').debug('title = {0}'.format(title))
                logging.getLogger('regular').debug(
                    'abstract = {0}'.format(abstract))
                logging.getLogger('regular').debug(
                    'authors = {0}'.format(authors))
                logging.getLogger('regular').debug(
                    'affiliations = {0}'.format(affiliations))
                logging.getLogger('regular').debug(
                    'publication type = {0}'.format(publication_type))
                logging.getLogger('regular').debug(
                    'mesh term = {0}'.format(mesh_term))
                logging.getLogger('regular').debug(
                    'data created = {0}'.format(date_created))

                # assign the chief author, ordinary author or principal investigator role to each author
                roles = assign_roles(authors)
                # check and assign whether the authors belong to the CHOP or PENN organization
                chop_organization, penn_organization = assign_organization(
                    affiliations)

                mesh_description = ''
                if mesh_term is None:
                    mesh_term = ''
                else:
                    mesh_description, term = convert_mesh_description(
                        mesh_description_dict, mesh_term)
                    mesh_term = ';'.join(mesh_term)

                # output information
                if mesh_description:
                    row = pd.DataFrame(
                        [[pmid, term, mesh_description]],
                        columns=['PMID', 'Primary_MeSH', 'Desc'])
                    medical_record_df = medical_record_df.append(
                        row, ignore_index=True)

                for author_index, organizations in enumerate(
                        zip(chop_organization, penn_organization)):
                    # check if the author belongs to either CHOP or PENN
                    if 1 in organizations:
                        row = pd.DataFrame([[
                            pmid, authors[author_index], organizations[0],
                            organizations[1], roles[author_index],
                            affiliations[author_index]
                        ]],
                                           columns=[
                                               'PMID', 'Author', 'author_chop',
                                               'author_penn', 'Role',
                                               'AffiliationInfo'
                                           ])
                        author_record_df = author_record_df.append(
                            row, ignore_index=True)

                authors = ';'.join(authors)

                row = pd.DataFrame([[
                    pmid, title, abstract, year, month, authors, mesh_term,
                    date
                ]],
                                   columns=[
                                       'PMID', 'Title', 'Abstract', 'Year',
                                       'Month', 'author_list', 'subject_list',
                                       'date'
                                   ])
                paper_record_df = paper_record_df.append(row)

                title_list.append(title)
                abstract_list.append(abstract)
                mesh_list.append(mesh_term)

            except Exception as e:
                msg = 'Error while processing PMID={0}'.format(pmid)
                logging.getLogger('regular').debug(msg)
                msg = 'Exception message = {0}'.format(e)
                logging.getLogger('regular').debug(msg)

        # contains all the metadata elements on the author level: PubMed unique Identifier number(PMID), AuthorID (as a
        # combination of the author's last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
        # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
        author_path = Path(path, 'author_record.csv')
        author_record_df.to_csv(author_path, index=False)
        # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract,
        # Year, Month, AuthorList, SubjectList, date
        paper_path = Path(path, 'paper_record.csv')
        paper_record_df.to_csv(paper_path, index=False)
        # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical
        # Subject Header (MESH) and the description ID
        medical_path = Path(path, 'medical_record.csv')
        medical_record_df.to_csv(medical_path, index=False)

        # store the record in a file for processing
        dataset = dict()
        dataset['title'] = title_list
        dataset['abstracts'] = abstract_list
        dataset['mesh'] = mesh_list
        dataset = pd.DataFrame(dataset)
        titles_abstracts_mesh_path = Path(path, 'titles_abstracts_mesh.csv')
        dataset.to_csv(path_or_buf=titles_abstracts_mesh_path, index=False)

    logging.getLogger('line.regular.time.line').info(
        'SCOSY finished running successfully.')
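assign_roles and assign_organization are not defined in these examples, but Example #8 below inlines equivalent logic (the first two authors are Chief Authors, the last one is the Principal Investigator, everyone else is an Ordinary Author; affiliation keywords decide CHOP vs. PENN). A sketch reconstructed on that basis, with the list-of-flags return shape inferred from how the results are consumed above:

def assign_roles(authors):
    # CA for the first two authors, PI for the last, OA otherwise
    roles = []
    for index in range(len(authors)):
        if index <= 1:
            roles.append('CA')
        elif index == len(authors) - 1:
            roles.append('PI')
        else:
            roles.append('OA')
    return roles


def assign_organization(affiliations):
    # flag each affiliation as CHOP (1/0) and PENN (1/0) by keyword
    chop, penn = [], []
    for affiliation in affiliations:
        text = affiliation.lower()
        chop.append(1 if 'children' in text else 0)
        penn.append(1 if ('perelman' in text or 'pennsylvania' in text
                          or 'school of medicine' in text) else 0)
    return chop, penn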
Example #4
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--log",
                        dest="logLevel",
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('line.regular.time.line').info(
        'Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI’s
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
    logging.getLogger('regular').info(
        'searching pubmed for the CHOP and UPENN authors')
    handle = Entrez.esearch(
        db="pubmed",
        retmax=50000,
        idtype="esearch",
        mindate="2014/01/01",
        maxdate="2017/05/01",
        term=
        "Perelman School of Medicine[Affiliation] OR Children's Hospital of "
        "Philadelphia[Affiliation] OR University of Pennsylvania School of "
        "Medicine[Affiliation] OR School of Medicine University of Pennsylvania[Affiliation]",
        usehistory="y")
    search_results = Entrez.read(handle)
    handle.close()
    # obtaining the list of relevant PMIDs
    id_list = search_results["IdList"]

    # get all the record based on the PMIDs
    logging.getLogger('regular').info(
        'getting relevant authors\' records based on PMIDs')
    fetch_records_handle = Entrez.efetch(db="pubmed",
                                         id=id_list,
                                         retmode="text",
                                         rettype="medline")
    # need to read all the data from the handle and store in a file because if we just read line by line from the
    # generator and the internet connection is not strong, then we run into http errors:
    # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    logging.getLogger('regular').info(
        'storing authors\' records on local file')
    with open("results.xml", "w") as out_handle:
        out_handle.write(fetch_records_handle.read())
    # the results are now in the results.xml file and the original handle has had all of its data extracted
    # (so we close it)
    fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(handle=records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # contains all the metadata elements on the author level: PubMed unique Identifier number(PMID), AuthorID (as a
    # combination of the author's last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
    # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_df = pd.DataFrame(columns=[
        'PMID', 'AuthorID', 'Author CHOP', 'Author PENN', 'ROLE', 'Affiliation'
    ])
    # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_df = pd.DataFrame(columns=[
        'PMID', 'Title', 'Abstract', 'Year', 'Month', 'Author List',
        'Subject List', 'Date'
    ])
    # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical
    # Subject Header (MESH) and the description ID
    medical_record_df = pd.DataFrame(columns=['PMID', 'MESH', 'Description'])

    title_list = list()
    abstract_list = list()

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):

        logging.getLogger('regular').debug(
            'record index = {0}'.format(record_index))

        try:
            pmid = record.get('PMID')
            title = record.get('TI')
            abstract = record.get('AB')
            authors = record.get('FAU')
            affiliations = record.get('AD')
            publication_type = record.get('PT')
            mesh_term = record.get('MH')
            date_created = record.get('EDAT')
            year, month = date_created.split('/')[:2]
            date = year + '/' + month

            logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
            logging.getLogger('regular').debug('title = {0}'.format(title))
            logging.getLogger('regular').debug(
                'abstract = {0}'.format(abstract))
            logging.getLogger('regular').debug('authors = {0}'.format(authors))
            logging.getLogger('regular').debug(
                'affiliations = {0}'.format(affiliations))
            logging.getLogger('regular').debug(
                'publication type = {0}'.format(publication_type))
            logging.getLogger('regular').debug(
                'mesh term = {0}'.format(mesh_term))
            logging.getLogger('regular').debug(
                'data created = {0}'.format(date_created))

            # assign the chief author, ordinary author or principal investigator role to each author
            roles = assign_roles(authors)
            # check and assign whether the authors belong to the CHOP or PENN organization
            chop_organization, penn_organization = assign_organization(
                affiliations)

            mesh_description = ''
            if mesh_term is None:
                mesh_term = ''
            else:
                term, mesh_description = convert_mesh_description(
                    mesh_description_dict, mesh_term)
                mesh_term = ';'.join(mesh_term)

            # output information
            if mesh_description:
                row = pd.DataFrame([[pmid, term, mesh_description]],
                                   columns=['PMID', 'MESH', 'Description'])
                medical_record_df = medical_record_df.append(row,
                                                             ignore_index=True)

            for author_index, organizations in enumerate(
                    zip(chop_organization, penn_organization)):
                if 1 in organizations:
                    row = pd.DataFrame([[
                        pmid, authors[author_index], organizations[0],
                        organizations[1], roles[author_index],
                        affiliations[author_index]
                    ]],
                                       columns=[
                                           'PMID', 'AuthorID', 'Author CHOP',
                                           'Author PENN', 'ROLE', 'Affiliation'
                                       ])
                    author_record_df = author_record_df.append(
                        row, ignore_index=True)

            authors = ';'.join(authors)

            row = pd.DataFrame([[
                pmid, title, abstract, year, month, authors, mesh_term, date
            ]],
                               columns=[
                                   'PMID', 'Title', 'Abstract', 'Year',
                                   'Month', 'Author List', 'Subject List',
                                   'Date'
                               ])
            paper_record_df = paper_record_df.append(row)

            title_list.append(title)
            abstract_list.append(abstract)

        except Exception as e:
            msg = 'Error while processing PMID={0}'.format(pmid)
            logging.getLogger('regular').debug(msg)
            msg = 'Exception message = {0}'.format(e)
            logging.getLogger('regular').debug(msg)

    # store the record in a file for processing
    dataset = dict()
    dataset['title'] = title_list
    dataset['abstracts'] = abstract_list
    dataset = pd.DataFrame(dataset)
    dataset.to_csv(path_or_buf='record_results/titles_abstracts.csv',
                   index=False)

    # read the records from the file
    # dataset = pd.read_csv('record_results/titles_abstracts.csv')

    # topic_modeling(dataset=dataset)

    pd.io.formats.excel.header_style = None
    # contains all the metadata elements on the author level: PubMed unique Identifier number(PMID), AuthorID (as a
    # combination of the author's last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
    # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_df.to_excel('record_results/author_record.xlsx',
                              sheet_name='author_record',
                              index=False)
    # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_df.to_excel('record_results/paper_record.xlsx',
                             sheet_name='paper_record',
                             index=False)
    # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical
    # Subject Header (MESH) and the description ID
    medical_record_df.to_excel('record_results/medical_record.xlsx',
                               sheet_name='medical_record',
                               index=False)

    logging.getLogger('line.regular.time.line').info(
        'Recommendation System script finished running successfully.')
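obtain_descriptions is also undefined here, but Example #8 below builds the same MeSH-to-description dictionary inline from a 2017MeshTree.csv file, so the helper is presumably just that logic wrapped in a function (the relative path here is an assumption; Example #8 hard-codes an absolute one):

import csv

def obtain_descriptions():
    # map each MeSH term to its description using the 2017MeshTree.csv file
    mesh_description_dict = dict()
    with open('template/2017MeshTree.csv') as mesh_tree_file:
        for line in csv.reader(mesh_tree_file, delimiter=','):
            # line[0] = Number, line[1] = Description, line[2] = MESH
            mesh_description_dict[line[2]] = line[1]
    return mesh_description_dict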
Example #5
def main():
    # ignore warning of compiling tensorflow from source
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--input_file',
                        help='dataset file that has not been processed')
    parser.add_argument('-tr',
                        '--train_file',
                        help='processed training dataset file')
    parser.add_argument('-te',
                        '--test_file',
                        help='processed testing dataset file')
    parser.add_argument("-l",
                        "--log",
                        dest="logLevel",
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        type=str.upper,
                        help="Set the logging level")
    parser.add_argument('-cv', '--cross_validation', action='store_true')
    parser.add_argument('-gs', '--grid_search', action='store_true')
    parser.add_argument('-svm',
                        '--svm',
                        help='run support vector machine',
                        action='store_true')
    parser.add_argument('-p',
                        '--processed_dataset',
                        action='store_true',
                        help='this flag is used when the training '
                        'and testing datasets are provided')
    parser.add_argument('-s',
                        '--store_datasets',
                        action='store_true',
                        help='this flag is used to store the training '
                        'and testing datasets on the local system')
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('line.regular.time.line').info('Running No_Show script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    tr_data = pd.read_csv(filepath_or_buffer=args.train_file, delimiter='|')
    te_data = pd.read_csv(filepath_or_buffer=args.test_file, delimiter='|')

    logging.getLogger('regular').debug('training dataset shape = {0}'.format(
        tr_data.shape))
    logging.getLogger('regular').debug('training dataset keys = {0}'.format(
        tr_data.keys()))
    logging.getLogger('regular').debug('testing dataset shape = {0}'.format(
        te_data.shape))
    logging.getLogger('regular').debug('testing dataset keys = {0}'.format(
        te_data.keys()))

    y_train_data = tr_data['NOSHOW'].values
    y_test_data = te_data['NOSHOW'].values
    x_train_data = tr_data.drop([
        'PATIENT_KEY', 'ENCOUNTER_APPOINTMENT_DATETIME',
        'ENCOUNTER_APPOINTMENT_STATUS', 'NOSHOW'
    ],
                                axis=1).values
    x_test_data = te_data.drop([
        'PATIENT_KEY', 'ENCOUNTER_APPOINTMENT_DATETIME',
        'ENCOUNTER_APPOINTMENT_STATUS', 'NOSHOW'
    ],
                               axis=1).values

    # check if cross validation flag is set
    run_model(training_data=x_train_data,
              testing_data=x_test_data,
              training_y=y_train_data,
              testing_y=y_test_data,
              svm_flag=args.svm,
              gs_flag=args.grid_search)
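run_model is not shown either. A sketch consistent with its call signature, assuming the svm flag selects a scikit-learn SVC and the grid-search flag wraps it in a small GridSearchCV (both assumptions; the non-SVM branch is left out because nothing in the example reveals it):

from sklearn import svm
from sklearn.model_selection import GridSearchCV

def run_model(training_data, testing_data, training_y, testing_y,
              svm_flag=False, gs_flag=False):
    # hypothetical sketch: only the SVM path is guessed here
    if svm_flag:
        model = svm.SVC()
        if gs_flag:
            # a small, illustrative parameter grid
            model = GridSearchCV(model, param_grid={'C': [0.1, 1, 10]})
        model.fit(training_data, training_y)
        print('SVM score = {0}'.format(model.score(testing_data, testing_y)))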
Example #6
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--log",
                        dest="logLevel",
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('line.regular.time.line').info(
        'Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI’s
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
    # logging.getLogger('regular').info('searching pubmed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01", maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the record based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store in a file because if we just read line by line from the
    # # generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # PMID=PubMed Unique Identifier, TI=Title, AB=Abstract, AD=Affiliation, FAU=Full Author, MH=MeSH Terms,
    # PT=Publication Type
    # for more information, look at the abbreviations in the /template/abbreviations.txt file
    author_information = {
        'PMID': '',
        'TI': '',
        'AB': '',
        'FAU': '',
        'AU': '',
        'MH': '',
        'PT': '',
        'AD': ''
    }

    author_list = list()
    affiliation_list = list()
    mesh_list = list()

    first_record = True

    # get the relevant information for each record
    for record_index, line in enumerate(fetch_records):
        logging.getLogger('regular').debug(
            'line index = {0}'.format(record_index))

        # remove new line delimiter
        line = line.replace('\n', '')

        # skip if empty string
        if not line:
            continue

        # getting the key (PMID, TITLE, ABSTRACT, etc) and its value
        key, value = line.split('- ', 1)
        # remove padding spaces from the key (keys are space-padded in the MEDLINE format)
        key = key.replace(' ', '')

        # check if key is relevant to the information of interest
        if key not in author_information.keys():
            continue

        if key == 'PMID':
            # if it is not the first record, that means that it is a new record and therefore needs to reset all the
            # variables
            if not first_record:
                author_information['AU'] = author_list
                author_information['AD'] = affiliation_list
                author_information['MH'] = mesh_list

                logging.getLogger('regular').debug(
                    'authors\' information = {0}'.format(author_information))

                # function to print's the author's information to the relevant files
                # output_author_information(author_information)

                author_information = {
                    'PMID': '',
                    'TI': '',
                    'AB': '',
                    'FAU': '',
                    'AU': '',
                    'MH': '',
                    'PT': '',
                    'AD': ''
                }

                author_list = list()
                affiliation_list = list()
                mesh_list = list()

        # there might be multiple authors per PMID and therefore we need to add them to a list
        if key == 'FAU':
            author_list.append(value)
        # each author might have one or more affiliations
        elif key == 'AD':
            affiliation_list.append(value)
        # there might be multiple mesh terms
        elif key == 'MH':
            # some of the mesh terms might have an * that needs to be removed
            mesh_list.append(value.replace('*', ''))

        # add the authors' information
        author_information[key] = value

        # changing first record flag
        first_record = False

    logging.getLogger('line.regular.time.line').info(
        'Recommendation System script finished running successfully.')
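The manual line-by-line parsing above re-implements what Bio.Medline already provides (Example #8 below uses it directly); with the library, the bookkeeping around the FAU/AD/MH lists collapses to something like:

from Bio import Medline

with open('results.xml') as records_handle:
    for record in Medline.parse(records_handle):
        # each record is a dict keyed by MEDLINE tags; list-valued
        # tags such as FAU, AD and MH come back as lists already
        pmid = record.get('PMID')
        authors = record.get('FAU', [])
        affiliations = record.get('AD', [])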
Example #7
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('line.regular.time.line').info('Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI’s
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"     # Always tell NCBI who you are
    # logging.getLogger('regular').info('searching pubmed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01", maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the record based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store in a file because if we just read line by line from the
    # # generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(handle=records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical
    # Subject Header (MESH) and the description ID
    medical_record_file = open('record_results/medical_record.csv', 'w')
    medical_record_file.write('PMID,Primary MeSH,Description\n')
    # contains all the metadata elements on the author level: Pubmed unique Identifier number(PMID), AuthorID (as a
    # combination of the author’s last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
    # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_file = open('record_results/author_record.csv', 'w')
    author_record_file.write('PMID,Author,Author_CHOP,Author_PENN,Role,Affiliation\n')
    # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_file = open('record_results/paper_record.csv', 'w')
    paper_record_file.write('PMID,Title,Abstract,Year,Month,Author List,Subject List,Date\n')

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):

        logging.getLogger('regular').debug('record index = {0}'.format(record_index))

        pmid = record.get('PMID')
        title = record.get('TI')
        abstract = record.get('AB')
        authors = record.get('FAU')
        affiliations = record.get('AD')
        publication_type = record.get('PT')
        mesh_term = record.get('MH')
        date_created = record.get('EDAT')
        year, month = date_created.split('/')[:2]
        date = year + '/' + month

        logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
        logging.getLogger('regular').debug('title = {0}'.format(title))
        logging.getLogger('regular').debug('abstract = {0}'.format(abstract))
        logging.getLogger('regular').debug('authors = {0}'.format(authors))
        logging.getLogger('regular').debug('affiliations = {0}'.format(affiliations))
        logging.getLogger('regular').debug('publication type = {0}'.format(publication_type))
        logging.getLogger('regular').debug('mesh term = {0}'.format(mesh_term))
        logging.getLogger('regular').debug('data created = {0}'.format(date_created))

        # assign the chief author, ordinary author or principal investigator role to each author
        roles = assign_roles(authors)
        # check and assign whether the authors belong to the CHOP or PENN organization
        chop_organization, penn_organization = assign_organization(affiliations)

        mesh_description = ''
        if mesh_term is None:
            mesh_term = ''
        else:
            term, mesh_description = convert_mesh_description(mesh_description_dict, mesh_term)
            mesh_term = ';'.join(mesh_term)

        # output information
        if mesh_description:
            msg = print_str(pmid, term, mesh_description)
            medical_record_file.write(msg)

        for author_index, organizations in enumerate(zip(chop_organization, penn_organization)):
            if 1 in organizations:
                msg = print_str(pmid, authors[author_index], organizations[0], organizations[1], roles[author_index],
                                affiliations[author_index])
                author_record_file.write(msg)

        authors = ';'.join(authors)

        msg = print_str(pmid, title, abstract, year, month, authors, mesh_term, date)
        paper_record_file.write(msg)

        if record_index == 10:
            break

    # closing all open files
    medical_record_file.close()
    author_record_file.close()
    paper_record_file.close()

    logging.getLogger('line.regular.time.line').info('Recommendation System script finished running successfully.')
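print_str is an undefined helper in this example; given that it feeds files whose headers were written with comma separators, a minimal hypothetical version would be:

def print_str(*fields):
    # join the fields into one comma-separated CSV line
    return ','.join(str(field) for field in fields) + '\n'

A real implementation would need csv-style quoting, since titles, abstracts and affiliations routinely contain commas.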
Example #8
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l",
                        "--log",
                        dest="logLevel",
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('line.regular.time.line').info(
        'Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI’s
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are
    # logging.getLogger('regular').info('searching pubmed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01", maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the record based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store in a file because if we just read line by line from the
    # # generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    records_handle = open("results.xml")

    logging.getLogger('regular').info('creating parser record handle')
    # Use the Bio.Medline module to parse records
    fetch_records = Medline.parse(records_handle)

    # contains all the metadata elements on the author level: Pubmed unique Identifier number(PMID), AuthorID (as a
    # combination of the author’s last name, first name, and initials), institution: chop=0, Penn=1, Role: Chief Author
    # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_df = pd.DataFrame(
        columns=['PMID', 'AuthorID', 'CHOP_PENN', 'ROLE', 'Affiliation'])
    # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_df = pd.DataFrame(columns=[
        'PMID', 'Title', 'Abstract', 'Year', 'Month', 'AuthorList',
        'SubjectList', 'Date'
    ])
    # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical
    # Subject Header (MESH) and the description ID
    medical_record_df = pd.DataFrame(columns=['PMID', 'MESH', 'Description'])

    # get the description, related to the MESH, in the 2017MeshTree.csv File
    mesh_tree_file_object = open(
        r'C:\Users\GUERRAMARJ\PycharmProjects\Pubmed\template\2017MeshTree.csv'
    )
    file_reader = csv.reader(mesh_tree_file_object, delimiter=',')
    mesh_description_dict = dict()

    logging.getLogger('regular').info(
        'processing each record and obtaining relevant information')
    for line in file_reader:
        # split_line[0] = Number, split_line[1] = Description and split_line[2] = MESH
        mesh_description_dict[line[2]] = line[1]
    mesh_tree_file_object.close()

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):
        logging.getLogger('regular').debug(
            'record index = {0}'.format(record_index))
        # initialize all the variables
        pmid = ''
        title = ''
        abstract = ''
        affiliation = ''
        author_id = ''
        role = ''
        mesh_term = ''

        try:
            pmid = record.get('PMID')
            title = record.get('TI')
            abstract = record.get('AB')

            logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
            logging.getLogger('regular').debug('title = {0}'.format(title))
            logging.getLogger('regular').debug(
                'abstract = {0}'.format(abstract))
            # only used for debugging
            publication_type = record.get('PT')
            logging.getLogger('regular').debug(
                'publication type = {0}'.format(publication_type))

            # Note: Currently the record.get("AD") method returns a string regardless of the number of authors i.e. if
            # there are two author, it will return as a string both affiliations. As of result, this script has to
            # manually get the author information and their respective affiliations
            fetch_records_handle = Entrez.efetch(db="pubmed",
                                                 id=pmid,
                                                 retmode="xml")
            manual_record = Entrez.read(fetch_records_handle)
            try:
                if 'Book Chapter' in publication_type:
                    authors_list = manual_record['PubmedBookArticle'][0][
                        'BookDocument']['AuthorList']
                else:
                    # author_list for a PubMed Article
                    authors_list = manual_record['PubmedArticle'][0][
                        'MedlineCitation']['Article']['AuthorList']
            except Exception:
                logging.getLogger('regular').debug(
                    'error while obtaining the authors\' list')
                continue

            for author_index, author in enumerate(authors_list):
                try:
                    affiliation = author['AffiliationInfo'][0]['Affiliation']
                    author_id = author['LastName'] + ', ' + author[
                        'ForeName'] + ', ' + author['Initials']

                    logging.getLogger('regular').debug(
                        'affiliation = {0}'.format(affiliation))
                    logging.getLogger('regular').debug(
                        'author id = {0}'.format(author_id))

                    # Assign the author organization
                    # 1 = chop, 0 = penn
                    chop_penn = None
                    if 'children' in affiliation.lower():
                        chop_penn = 1
                    elif 'perelman' in affiliation.lower() or 'school of medicine' in affiliation.lower() or  \
                            'pennsylvania' in affiliation.lower():
                        chop_penn = 0

                    logging.getLogger('regular').debug(
                        'chop_penn = {0}'.format(chop_penn))

                    # Assign the author's role
                    # the first two authors are considered "Chief Authors" (CA)
                    if author_index <= 1:
                        role = 'CA'
                    # an author after the first two who isn't the last author is considered an
                    # "Ordinary Author" (OA)
                    elif author_index < len(authors_list) - 1:
                        role = 'OA'
                    # the last author is the "Principal Investigator" (PI)
                    elif author_index == len(authors_list) - 1:
                        role = 'PI'
                    else:
                        raise ValueError('Wrong author role specified')

                    logging.getLogger('regular').debug(
                        'role = {0}'.format(role))

                    if chop_penn is not None:
                        # insert the author information into the dataframe for later processing
                        author_record_df.loc[record_index] = [
                            pmid, author_id, chop_penn, role, affiliation
                        ]
                except (IndexError, KeyError):
                    # sometimes there will be organizations on the authors list; in those cases, skip them
                    continue

            # Medical Subject Headings (MESH)
            # this can be a list
            mesh_term = record.get("MH")
            logging.getLogger('regular').debug(
                'mesh term = {0}'.format(mesh_term))
            if mesh_term is not None:
                # fetch the description obtained from the 2017MeshTree file
                description = ''

                # because there are mesh terms that are not part of the 2017MeshTree, we have to loop through
                # all of them until one works i.e. the first one found in the 2017MeshTree
                for mesh in mesh_term:

                    try:
                        term = mesh

                        # cleaning string
                        if '/' in term:
                            term = term.split('/')[0]
                        if '*' in term:
                            term = term.replace('*', '')

                        logging.getLogger('regular').debug(
                            'term = {0}'.format(term))

                        description = mesh_description_dict[term]
                        break

                    except KeyError:
                        logging.getLogger('regular').debug(
                            'not found term = {0}'.format(term))
                        continue

                # insert the values in the dataframe
                row = pd.DataFrame([[pmid, mesh_term, description]],
                                   columns=['PMID', 'MESH', 'Description'])
                medical_record_df = medical_record_df.append(row,
                                                             ignore_index=True)

            # insert the paper information in the paper record dataframe
            # paper_record_df.append([pmid, title, abstract, year, month, authors_list, subject_list, date)

        except ValueError as error_message:
            msg = 'Problem while processing PMID={0}'.format(pmid)
            print(msg)
            print('error message = {0}'.format(error_message))

    logging.getLogger('line.regular.time.line').info(
        'Recommendation System script finished running successfully.')
Example #9
def main():
    parser = argparse.ArgumentParser(description='Grasp Assertiveness Script')
    parser.add_argument('-d',
                        '--directory',
                        help='dataset directory',
                        required=True)
    parser.add_argument("-l",
                        "--log",
                        dest="logLevel",
                        choices=['DEBUG', 'INFO', 'ERROR'],
                        help="Set the logging level")
    args = parser.parse_args()

    # check directory exists
    if not os.path.isdir(args.directory):
        msg = 'Directory = {0} not found.'.format(args.directory)
        raise IOError(msg)
    else:
        working_dir = args.directory

    logger_initialization(logger_dir=working_dir, parser=parser)

    logging.getLogger('time.info').info(
        'Running the Grasp Assertiveness Script')

    try:
        dataset_dir = os.path.join(working_dir, 'dataset.csv')
        dataset = pd.read_csv(dataset_dir)
    except IOError:
        msg = 'Could not find \'dataset.csv\' in directory {0}'.format(
            working_dir)
        logging.getLogger('info').error(msg)
        raise IOError(msg)

    # index the dataset based on the trial index
    indexed_dataset = dataset.set_index(keys=['trial_index'])

    random_state = 7
    testing_size = 0.10
    logging.getLogger('info').info('Obtaining training and testing datasets')
    x_train, x_test, y_train, y_test, train_indx_list, test_indx_list, train_cl_indx, test_cl_indx = train_test_split(
        indexed_dataset, test_size=testing_size, random_state=random_state)

    msg = 'training and testing parameters:\n\t\ttesting size = {0}, random state = {1}'.format(
        testing_size, random_state)
    logging.getLogger('tab.info').info(msg)

    # converting the index to int (to remove decimal) and the convert them to string to be able to print them all
    x_train_index = [str(int(indx)) for indx in set(x_train.index)]
    x_test_index = [str(int(indx)) for indx in set(x_test.index)]
    training_indices = ','.join(x_train_index)
    testing_indices = ','.join(x_test_index)
    msg = 'training indices:\n\t\t' + training_indices + '\n\ttesting indices:\n\t\t' + testing_indices
    logging.getLogger('tab.info.line').info(msg)

    training_dataset = x_train.values[train_cl_indx, :]
    training_labels = y_train.values[train_cl_indx]
    testing_dataset = x_test.values[test_cl_indx, :]
    testing_labels = y_test.values[test_cl_indx]

    logging.getLogger('info').info('Running SVM')
    # training and testing on time-independent dataset
    clf = svm.SVC()
    clf.fit(training_dataset, training_labels)
    svm_score = clf.score(testing_dataset, testing_labels)

    msg = 'SVM score = {0}'.format(svm_score)
    logging.getLogger('tab.info').info(msg)

    logging.getLogger('info').info('Running HMM')
    n_pos_components = [2, 5, 7, 15]
    cov_types = ['diag', 'tied', 'spherical']
    n_iterations = [5, 10, 20, 50]
    for nc in n_pos_components:
        for cov in cov_types:
            for _iter in n_iterations:

                try:
                    msg = 'running HMM with the following parameters'
                    logging.getLogger('time.info').info(msg)
                    msg = 'number of states = {0}, type of covariance = {1}, number of iterations = {2}'.format(
                        nc, cov, _iter)
                    logging.getLogger('tab.info').info(msg)
                    # training and testing on time-dependent dataset
                    hmm_model = hmm.GaussianHMM(n_components=nc,
                                                random_state=random_state,
                                                covariance_type=cov,
                                                n_iter=_iter)
                    hmm_model.fit(x_train, lengths=train_indx_list)

                    # training hmm and logistic regression
                    hmm_training_predictions = hmm_model.predict(
                        x_train, lengths=train_indx_list)

                    hmm_training_predictions_reshaped, labels_processed = reshape_predictions(
                        predictions=hmm_training_predictions, labels=y_train)

                    msg = 'running Logistic Regression'
                    logging.getLogger('tab.time.info').info(msg)

                    # mapping hmm labels to true labels
                    logistic_regression_model = LogisticRegression()
                    logistic_regression_model.fit(
                        X=hmm_training_predictions_reshaped,
                        y=labels_processed)

                    # predictions on testing dataset
                    hmm_testing_predictions = hmm_model.predict(
                        x_test, lengths=test_indx_list)
                    hmm_testing_prediction_reshaped, testing_labels_processed = reshape_predictions(
                        predictions=hmm_testing_predictions, labels=y_test)
                    time_score = logistic_regression_model.score(
                        X=hmm_testing_prediction_reshaped,
                        y=testing_labels_processed)
                    msg = 'HMM-Logistic Regression score = {0}'.format(
                        time_score)
                    logging.getLogger('tab.time.info').info(msg)

                except ValueError as error_message:
                    msg = 'Error while processing the following parameters ' \
                          '\n\t\tnumber of states = {0}, type of covariance = {1}, number of iterations = {2}'.format(
                            nc, cov, _iter)
                    logging.getLogger('tab.info').error(msg)
                    msg = 'error message = {0}'.format(error_message)
                    logging.getLogger('tab.tab.info').error(msg)

                msg = 'finished running HMM'
                logging.getLogger('time.info').info(msg)

    logging.getLogger('time.info').info(
        'Finished running the Grasp Assertiveness Script')
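reshape_predictions is another project helper that is not shown. From its call sites it takes the 1-D HMM state sequence plus the label series and returns a feature matrix and an aligned label vector for the logistic regression; a hypothetical minimal version:

import numpy as np

def reshape_predictions(predictions, labels):
    # turn the 1-D state sequence into an (n_samples, 1) feature matrix
    reshaped = np.asarray(predictions).reshape(-1, 1)
    return reshaped, np.asarray(labels)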
Example #10
    """
    end program
    return : exit 0
    """
    msg = 'Program terminated.\n'
    logging.getLogger('').info(msg)
    exit(0)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'],
                        help="Set the logging level")

    logger_initialization(parser=parser)

    selected_option = -1
    while selected_option != 9:

        # options
        print('')
        print('Program Menu:\n')
        print('1: Gaussian-HMM program')
        print('2: GMM-HMM program')
        print('3: Check Matlab files')
        print('4: Convert Matlab files to hdf5 format file')
        print('5: Process matlab files with basic features')
        print('6: Move Matlab files from Dropbox to Working Directory')
        print('7: Perform Logistic Regression')
        print('8: Perform LSTM')